pegRNA Library Comparison

Author

Lance Parsons

Read data

  1. Get list of samples
Code
samples <- read_tsv("config/samples.tsv",
  col_types = list(
    sample_name = col_character(),
    cell_line = col_factor(),
    exogenous_rna = col_factor(),
    day = col_factor()
  )
)
units <- read_tsv("config/units.tsv",
  col_types = list(
    sample_name = col_character(),
    unit_name = col_character(),
    fq1 = col_character(),
    fq2 = col_character()
  )
)
sample_units <- dplyr::left_join(samples, units, by = "sample_name") %>%
  unite(sample_unit, sample_name, unit_name, remove = FALSE)
sample_units
  1. Read Samtools idxstats to get human coverage for normalization

Notes:

  • The counts include the total number of reads aligned, they are not limited to uniquely aligned reads.
  • The counts are reads, not pairs or fragments
Code
idxstats_exogenousrna_dir <-
  "results/samtools_idxstats/exogenous_rna/"

idxstats_human_dir <-
  "results/samtools_idxstats/Homo_sapiens.GRCh38.dna.primary_assembly/"

bowtie2_human_logs <-
  "results/logs/bowtie2/Homo_sapiens.GRCh38.dna.primary_assembly/"

idxstats <- tibble()

for (row in seq_len(nrow(sample_units))) {
  sample <- sample_units[row, ]$sample_unit

  # Read `idsxstats` for exogenous mapped reads
  exogenous_rna_stats <- read_tsv(
    file.path(idxstats_exogenousrna_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  exogenous_rna_mapped_reads <- exogenous_rna_stats %>%
    dplyr::filter(!sequence_name %in% c("*")) %>%
    dplyr::select(sequence_name, mapped_reads) %>%
    dplyr::mutate(sample = sample)

  # Read `idxstats` for human mapped reads
  human_stats <- read_tsv(
    file.path(idxstats_human_dir, sprintf("%s.bam.idxstats", sample)),
    col_names = c(
      "sequence_name", "sequence_length",
      "mapped_reads", "unmapped_reads"
    ),
    col_types = "ciii"
  )
  grch38_mapped_reads <- human_stats %>%
    dplyr::filter(!sequence_name %in% c("*")) %>%
    dplyr::select(mapped_reads) %>%
    sum()
  grch38_mapped_reads <- tibble(
    sequence_name = "grch38_mapped_reads",
    mapped_reads = grch38_mapped_reads,
    sample = sample
  )

  # Read bowtie2 logs for unmapped reads
  bowtie2_log <- readLines(
    file.path(bowtie2_human_logs, sprintf("%s.log", sample))
  )
  total_pairs <- strtoi(str_split(bowtie2_log[1], " ")[[1]][1])
  total_reads <- total_pairs * 2
  unmapped_reads <- tibble(
    sequence_name = "unmapped",
    mapped_reads = total_reads - grch38_mapped_reads$mapped_reads,
    sample = sample
  )

  # Consolidate counts for rows
  idxstats <- rbind(
    idxstats,
    exogenous_rna_mapped_reads,
    grch38_mapped_reads,
    unmapped_reads
  )
}
idxstats

Biotype comparison

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
Code
human_counts_dir <- "results/smrna_count/"
biotype_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_biotype_count.txt"
)
exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0(
  exogenous_counts_dir,
  sample_units$sample_unit,
  "_idxstats.txt"
)

biotype_counts <- readr::read_tsv(
  biotype_counts_files[1],
  comment = "#",
  col_names = c("biotype", biotype_counts_files[1]),
  col_types = "ci"
)
exogenous_counts <- read_tsv(
  exogenous_counts_files[1],
  col_names = c("exogenous_rna", exogenous_counts_files[1]),
  col_types = "c-i-"
)
biotype_counts <- biotype_counts %>%
  dplyr::add_row(
    biotype = "exogenous_rna",
    "{biotype_counts_files[1]}" := sum(exogenous_counts[[exogenous_counts_files[1]]])
  )

for (i in 2:length(biotype_counts_files)) {
  biotype_sample <-
    readr::read_tsv(
      biotype_counts_files[i],
      comment = "#",
      col_names = c("biotype", biotype_counts_files[i]),
      col_types = "ci"
    )

  exogenous_counts_sample <-
    readr::read_tsv(
      exogenous_counts_files[i],
      col_names = c("exogenous_rna", exogenous_counts_files[i]),
      col_types = "c-i-"
    )

  biotype_sample <- biotype_sample %>%
    dplyr::add_row(
      biotype = "exogenous_rna",
      "{biotype_counts_files[i]}" :=
        sum(exogenous_counts_sample[[exogenous_counts_files[i]]])
    )

  biotype_counts <- biotype_counts %>%
    dplyr::full_join(biotype_sample, by = "biotype")
}

biotype_counts <- biotype_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ stringr::str_replace_all(., "_first_proper_pair_biotype_count.txt", "")) %>%
  tidyr::pivot_longer(!biotype, names_to = "sample", values_to = "count")

biotype_counts
Code
p <- ggplot(
  data = subset(biotype_counts, !is.na(count)),
  aes(x = sample, y = count, fill = biotype)
) +
  geom_bar(stat = "identity", position = "fill") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
p

small RNA gene counts

  • count only fragments that were properly aligned
  • annotate with GENCODE gene model
  • primary alignments were counted, even if the fragments aligned multiple times
  • fragments aligning to multiple features were assigned to the feature that mostly closely overlapped with the fragment
  • exogenous RNA counts are total fragments that aligned
Code
human_counts_dir <- "results/smrna_count/"
gene_counts_files <- paste0(
  human_counts_dir,
  sample_units$sample_unit,
  "_first_proper_pair_gene_count.txt"
)

exogenous_counts_dir <- "results/exogenous_rna_count/"
exogenous_counts_files <- paste0(
  exogenous_counts_dir,
  sample_units$sample_unit,
  "_idxstats.txt"
)

gene_counts <- readr::read_tsv(
  gene_counts_files[1],
  comment = "#",
  col_names = c("gene", gene_counts_files[1]),
  col_types = "ci"
)
exogenous_counts <- read_tsv(
  exogenous_counts_files[1],
  col_names = c("gene", gene_counts_files[1]),
  col_types = "c-i-"
)
gene_counts <- rbind(gene_counts, exogenous_counts)

for (i in 2:length(gene_counts_files)) {
  gene_sample <-
    readr::read_tsv(
      gene_counts_files[i],
      comment = "#",
      col_names = c("gene", gene_counts_files[i]),
      col_types = "ci"
    )
  exogenous_counts_sample <- read_tsv(
    exogenous_counts_files[i],
    col_names = c("gene", gene_counts_files[i]),
    col_types = "c-i-"
  )
  gene_sample <- rbind(gene_sample, exogenous_counts_sample)
  gene_counts <- gene_counts %>%
    dplyr::full_join(gene_sample, by = "gene")
}

gene_counts <- gene_counts %>%
  rename_all(~ stringr::str_replace_all(
    ., human_counts_dir, ""
  )) %>%
  rename_all(~ stringr::str_replace_all(., "_first_proper_pair_gene_count.txt", ""))

gene_counts

Import sample metadata and counts into DESeq2

Read these counts into DESeq2 along with the sample metadata.

Set the design to ~ exogenous_rna + day + cell_line.

Code
dds <- DESeqDataSetFromMatrix(
  countData = gene_counts %>%
    tibble::column_to_rownames("gene") %>%
    replace(is.na(.), 0),
  colData = sample_units,
  design = ~ exogenous_rna + day + cell_line
)
converting counts to integer mode
Code
dds
class: DESeqDataSet 
dim: 56701 56 
metadata(1): version
assays(1): counts
rownames(56701): GAS5 SNORD30 ... MIR6134 ENSG00000270413
rowData names(0):
colnames(56): Parental_mastermix1_day1_rep1
  Parental_mastermix1_day1_rep2 ... Parental_PJY300_epegRNA_day2_rep3
  Parental_PJY300_epegRNA_day2_rep4
colData names(8): sample_unit sample_name ... fq1 fq2

Sample comparisons

Variance Stabalized Transformation

Code
vsd <- vst(dds, blind = FALSE)

Heatmap of sample-to-sample distances

Code
sample_dists <- dist(t(assay(vsd)))

sample_dist_matrix <- as.matrix(sample_dists)
rownames(sample_dist_matrix) <- paste(vsd$cell_line,
  vsd$exogenous_rna,
  paste0("day", vsd$day),
  sep = "-"
)
colnames(sample_dist_matrix) <- NULL
Code
colors <- colorRampPalette(rev(brewer.pal(9, "Blues")))(255)
pheatmap(sample_dist_matrix,
  clustering_distance_rows = sample_dists,
  clustering_distance_cols = sample_dists,
  col = colors
)

PCA plot of samples

Code
plotPCA(vsd, intgroup = c("cell_line", "exogenous_rna", "day"))

Differetial Expression

These analyses use size factors calculated by DESeq2. They are based off the gene counts and attempt to account for outliers.

Calculate the log2 fold change: cell line P1E10 vs Parental

Code
dds <- DESeq(dds, parallel = TRUE)
estimating size factors
estimating dispersions
gene-wise dispersion estimates: 4 workers
mean-dispersion relationship
final dispersion estimates, fitting model and testing: 4 workers
Code
res <- results(dds, alpha = 0.05)
res
log2 fold change (MLE): cell line P1E10 vs Parental 
Wald test p-value: cell line P1E10 vs Parental 
DataFrame with 56701 rows and 6 columns
                 baseMean log2FoldChange     lfcSE      stat      pvalue
                <numeric>      <numeric> <numeric> <numeric>   <numeric>
GAS5              3639368       0.233001 0.0263836   8.83130 1.03468e-18
SNORD30           1780896       0.200349 0.0316464   6.33087 2.43785e-10
SNORD49A          1355750       0.253307 0.0383049   6.61290 3.76862e-11
SNORD26            926681       0.153962 0.0315809   4.87514 1.08730e-06
SNHG1              767085       0.011249 0.0280887   0.40048 6.88803e-01
...                   ...            ...       ...       ...         ...
ENSG00000275017 0.0166226      0.0386993   2.93643  0.013179    0.989485
ENSG00000214669 0.0166226      0.0386993   2.93643  0.013179    0.989485
ENSG00000276467 0.0166226      0.0386993   2.93643  0.013179    0.989485
MIR6134         0.0166226      0.0386993   2.93643  0.013179    0.989485
ENSG00000270413 0.0166226      0.0386993   2.93643  0.013179    0.989485
                       padj
                  <numeric>
GAS5            2.77040e-17
SNORD30         3.75948e-09
SNORD49A        6.19251e-10
SNORD26         1.14213e-05
SNHG1           8.36197e-01
...                     ...
ENSG00000275017          NA
ENSG00000214669          NA
ENSG00000276467          NA
MIR6134                  NA
ENSG00000270413          NA
Code
summary(res)

out of 56700 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up)       : 4044, 7.1%
LFC < 0 (down)     : 3934, 6.9%
outliers [1]       : 12, 0.021%
low counts [2]     : 23085, 41%
(mean count < 1)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
Code
# List of exogenous genes to highlight
exogenous_rna_names <- gene_counts %>%
  dplyr::filter(str_detect(gene, "^PJY")) %>%
  pull(gene)

MA Plot

Code
plotMA(res, ylim = c(-5, 5))

Code
res_df <- res %>%
    as.data.frame() %>%
    rownames_to_column("gene") %>%
    dplyr::mutate(significant = padj < 0.05)

res_labelled <- res_df %>%
    dplyr::filter(gene %in% exogenous_rna_names)

ggplot(res_df,
       aes(x=log2(baseMean), y=log2FoldChange, colour=significant)) +
    geom_point() +
    ggrepel::geom_label_repel(data=res_labelled, aes(label=gene)) +
    theme_bw()
Warning: Removed 1 rows containing missing values (geom_point).

Plot of counts for a single gene (with lowest adjusted p-value)

Code
plotCounts(dds, gene = which.min(res$padj), intgroup = "cell_line")

Log fold change shrinkage

Run log fold change shrinkage to account for low counts.

MA Plot (lfc shrunk)

Code
res_lfc <- lfcShrink(dds,
  coef = "cell_line_P1E10_vs_Parental",
  type = "apeglm", parallel = TRUE
)
using 'apeglm' for LFC shrinkage. If used in published research, please cite:
    Zhu, A., Ibrahim, J.G., Love, M.I. (2018) Heavy-tailed prior distributions for
    sequence count data: removing the noise and preserving large differences.
    Bioinformatics. https://doi.org/10.1093/bioinformatics/bty895

MA Plot of shrunken log2 fold changes

Code
res_lfc_df <- res_lfc %>%
    as.data.frame() %>%
    rownames_to_column("gene") %>%
    dplyr::mutate(significant = padj < 0.05)

res_lfc_labelled <- res_lfc_df %>%
    dplyr::filter(gene %in% exogenous_rna_names)

ggplot(res_lfc_df,
       aes(x=log2(baseMean), y=log2FoldChange, colour=significant)) +
    geom_point() +
    ggrepel::geom_label_repel(data=res_lfc_labelled, aes(label=gene)) +
    theme_bw()
Warning: Removed 1 rows containing missing values (geom_point).